#include "crtMemcpy.h"
#include <memory>
#include <cstring>		// memcpy
#include <malloc.h>		// _alloca

namespace FOXSDK
{



#define EMMS_INSTRUCTION		__asm emms
#define _alloca16( x )					((void *)((((int)_alloca( (x)+15 )) + 15) & ~15))
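
	// The _alloca16 macro rounds an allocation up to the next 16-byte
	// boundary: add 15, then clear the low four bits. A minimal sketch of
	// the same align-up arithmetic as a standalone helper (hypothetical
	// name, not part of the original SDK):
	static inline void *AlignUp16( void *p ) {
		return (void *)( ( (int)p + 15 ) & ~15 );	// e.g. 0x1003 -> 0x1010
	}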

	/*
	================
	MMX_Memcpy8B

	copies count bytes in 8-byte chunks using non-temporal stores;
	the caller handles any tail of count & 7 bytes
	================
	*/
	void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
		_asm {
			mov		esi, src
			mov		edi, dest
			mov		ecx, count
			shr		ecx, 3			// 8 bytes per iteration

loop1:
			movq	mm1,  0[ESI]	// Read in source data
			movntq	0[EDI], mm1		// Non-temporal stores

			add		esi, 8
			add		edi, 8
			dec		ecx
			jnz		loop1
		}
		EMMS_INSTRUCTION
	}

	/*
	================
	MMX_Memcpy64B

	copies count bytes in 64-byte chunks with software prefetch and
	non-temporal stores; the caller handles any tail of count & 63 bytes

	165MB/sec
	================
	*/
	void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
		_asm {
			mov		esi, src
			mov		edi, dest
			mov		ecx, count
			shr		ecx, 6		// 64 bytes per iteration

loop1:
			prefetchnta 64[ESI]	// Prefetch next loop, non-temporal
			prefetchnta 96[ESI]

			movq mm1,  0[ESI]	// Read in source data
			movq mm2,  8[ESI]
			movq mm3, 16[ESI]
			movq mm4, 24[ESI]
			movq mm5, 32[ESI]
			movq mm6, 40[ESI]
			movq mm7, 48[ESI]
			movq mm0, 56[ESI]

			movntq  0[EDI], mm1	// Non-temporal stores
			movntq  8[EDI], mm2
			movntq 16[EDI], mm3
			movntq 24[EDI], mm4
			movntq 32[EDI], mm5
			movntq 40[EDI], mm6
			movntq 48[EDI], mm7
			movntq 56[EDI], mm0

			add		esi, 64
			add		edi, 64
			dec		ecx
			jnz		loop1
		}
		EMMS_INSTRUCTION
	}

	/*
	================
	MMX_Memcpy2kB

	streams count bytes in 2 kB blocks through a 16-byte aligned stack
	buffer: each block is first pulled into L1 with plain movq stores,
	then written out with non-temporal stores, separating the read and
	write bursts; the caller handles any tail of count & 2047 bytes

	240MB/sec
	================
	*/
	void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
		unsigned char *tbuf = (unsigned char *)_alloca16(2048);
		__asm {
			push	ebx
			mov		esi, src
			mov		ebx, count
			shr		ebx, 11		// 2048 bytes at a time
			mov		edi, dest

loop2k:
			push	edi			// copy 2k into temporary buffer
			mov		edi, tbuf
			mov		ecx, 32

loopMemToL1:
			prefetchnta 64[ESI] // Prefetch next loop, non-temporal
			prefetchnta 96[ESI]

			movq mm1,  0[ESI]	// Read in source data
			movq mm2,  8[ESI]
			movq mm3, 16[ESI]
			movq mm4, 24[ESI]
			movq mm5, 32[ESI]
			movq mm6, 40[ESI]
			movq mm7, 48[ESI]
			movq mm0, 56[ESI]

			movq  0[EDI], mm1	// Store into L1
			movq  8[EDI], mm2
			movq 16[EDI], mm3
			movq 24[EDI], mm4
			movq 32[EDI], mm5
			movq 40[EDI], mm6
			movq 48[EDI], mm7
			movq 56[EDI], mm0
			add		esi, 64
			add		edi, 64
			dec		ecx
			jnz		loopMemToL1

			pop		edi			// Now copy from L1 to system memory
			push	esi
			mov		esi, tbuf
			mov		ecx, 32

loopL1ToMem:
			movq mm1, 0[ESI]	// Read in source data from L1
			movq mm2, 8[ESI]
			movq mm3, 16[ESI]
			movq mm4, 24[ESI]
			movq mm5, 32[ESI]
			movq mm6, 40[ESI]
			movq mm7, 48[ESI]
			movq mm0, 56[ESI]

			movntq 0[EDI], mm1	// Non-temporal stores
			movntq 8[EDI], mm2
			movntq 16[EDI], mm3
			movntq 24[EDI], mm4
			movntq 32[EDI], mm5
			movntq 40[EDI], mm6
			movntq 48[EDI], mm7
			movntq 56[EDI], mm0

			add		esi, 64
			add		edi, 64
			dec		ecx
			jnz		loopL1ToMem

			pop		esi			// Do next 2k block
			dec		ebx
			jnz		loop2k
			pop		ebx
		}
		EMMS_INSTRUCTION
	}
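
	// The 2 kB routine above stages each block through a cache-resident
	// buffer so the bus sees one long read burst followed by one long
	// write burst. A minimal portable sketch of that staging idea
	// (illustrative only; without movntq stores it will not match the
	// asm version's bandwidth):
	static inline void StagedCopyExample( unsigned char *dest, const unsigned char *src, int blocks2k ) {
		unsigned char buf[2048];
		for ( int i = 0; i < blocks2k; i++ ) {
			memcpy( buf, src + i * 2048, 2048 );	// read burst into cache
			memcpy( dest + i * 2048, buf, 2048 );	// write burst out to memory
		}
	}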


	/*
	================
	MMXMemcpy

	optimized memory copy routine that handles all alignment cases and block sizes efficiently
	================
	*/
	void MMXMemcpy( void *dest0, const void *src0, const int count0 ) {
		// if copying more than 16 bytes and we can copy 8 byte aligned
		if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
			unsigned char *dest = (unsigned char *)dest0;
			unsigned char *src = (unsigned char *)src0;

			// copy up to the first 8 byte aligned boundary
			int count = ( 8 - ( (int)dest & 7 ) ) & 7;	// bytes needed to reach the boundary (0 if already aligned)
			memcpy( dest, src, count );
			dest += count;
			src += count;
			count = count0 - count;

			// if there are multiple blocks of 2kB
			if ( count & ~4095 ) {
				MMX_Memcpy2kB( dest, src, count );
				src += (count & ~2047);
				dest += (count & ~2047);
				count &= 2047;
			}

			// if there are blocks of 64 bytes
			if ( count & ~63 ) {
				MMX_Memcpy64B( dest, src, count );
				src += (count & ~63);
				dest += (count & ~63);
				count &= 63;
			}

			// if there are blocks of 8 bytes
			if ( count & ~7 ) {
				MMX_Memcpy8B( dest, src, count );
				src += (count & ~7);
				dest += (count & ~7);
				count &= 7;
			}

			// copy any remaining bytes
			memcpy( dest, src, count );
		} else {
			// use the regular one if we cannot copy 8 byte aligned
			memcpy( dest0, src0, count0 );
		}
	}
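
	// Worked example: count0 = 5000 with dest0 and src0 already 8-byte
	// aligned, so the head copy moves 0 bytes. The tiers then copy
	// 4096 bytes (count & ~2047, two blocks) via MMX_Memcpy2kB,
	// 896 bytes via MMX_Memcpy64B, and 8 bytes via MMX_Memcpy8B,
	// leaving 0 for the tail: 4096 + 896 + 8 = 5000.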


#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
	// The smallest copy uses the X86 "movsd" instruction, in an optimized
	// form which is an "unrolled loop".
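
	// The movsb/movsd arrays in the routine below are entered by a
	// computed jump so that only the needed number of copies execute.
	// A minimal sketch of the same trick in portable C++ using switch
	// fallthrough (hypothetical helper, not part of the original SDK):
	static inline void CopyTailDwords( unsigned int *dst, const unsigned int *src, int n ) {
		switch ( n ) {	// n = 0..15 remaining dwords; the highest case runs first
		case 15: *dst++ = *src++;	// fall through
		case 14: *dst++ = *src++;	// fall through
		case 13: *dst++ = *src++;	// fall through
		case 12: *dst++ = *src++;	// fall through
		case 11: *dst++ = *src++;	// fall through
		case 10: *dst++ = *src++;	// fall through
		case  9: *dst++ = *src++;	// fall through
		case  8: *dst++ = *src++;	// fall through
		case  7: *dst++ = *src++;	// fall through
		case  6: *dst++ = *src++;	// fall through
		case  5: *dst++ = *src++;	// fall through
		case  4: *dst++ = *src++;	// fall through
		case  3: *dst++ = *src++;	// fall through
		case  2: *dst++ = *src++;	// fall through
		case  1: *dst++ = *src++;	// fall through
		case  0: break;
		}
	}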

#define IN_CACHE_COPY (64 * 1024)  // upper limit for movq/movq copy w/SW prefetch
	// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
	// also using the "unrolled loop" optimization.   This code uses
	// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY (197 * 1024) // upper limit for movq/movntq w/SW prefetch
	// For larger blocks, which will spill beyond the cache, it's faster to
	// use the Streaming Store instruction MOVNTQ.   This write instruction
	// bypasses the cache and writes straight to main memory.  This code also
	// uses the software prefetch instruction to pre-read the data.
	// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch (documentation only; never expanded)
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch; MASM hex for 128, valid only inside __asm
	// For the largest size blocks, a special technique called Block Prefetch
	// can be used to accelerate the read operations.   Block Prefetch reads
	// one address per cache line, for a series of cache lines, in a short loop.
	// This is faster than using software prefetch.  The technique is great for
	// getting maximum read bandwidth, especially in DDR memory systems.
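
	// A minimal portable sketch of the block prefetch idea (illustrative
	// only; the real implementation in AMD3DNowMemcpy below does this in
	// asm): one read per 64-byte cache line, walking backwards, pulls
	// each line into cache at full read bandwidth before the copy loop.
	static inline void BlockPrefetchExample( const unsigned char *src, int lines ) {
		int sink = 0;
		for ( int i = lines - 1; i >= 0; i-- ) {
			sink += *(const volatile int *)( src + i * 64 );	// touch one word per cache line
		}
		(void)sink;	// the volatile reads themselves cannot be optimized away
	}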


	/*
	================
	AMD3DNowMemcpy

	tiered copy in the style of AMD's published block prefetch example:
	movsd for tiny blocks, an in-cache MMX copy, a streaming (movntq)
	copy, and block prefetch for the largest blocks
	================
	*/
	fvoid AMD3DNowMemcpy( fvoid* dest, const fvoid* src, const fint32 count )
	{
		__asm {

			mov		ecx, [count]				// number of bytes to copy
			mov		edi, [dest]					// destination
			mov		esi, [src]					// source
			mov		ebx, ecx					// keep a copy of count

			cld
			cmp		ecx, TINY_BLOCK_COPY
			jb		$memcpy_ic_3				// tiny? skip mmx copy

			cmp		ecx, 32*1024				// don't align between 32k-64k because
			jbe		$memcpy_do_align			//  it appears to be slower
			cmp		ecx, 64*1024
			jbe		$memcpy_align_done
$memcpy_do_align:
			mov		ecx, 8						// a trick that's faster than rep movsb...
			sub		ecx, edi					// align destination to qword
			and		ecx, 111b					// get the low bits
			sub		ebx, ecx					// update copy count
			neg		ecx							// set up to jump into the array
			add		ecx, offset $memcpy_align_done
			jmp		ecx							// jump to array of movsb's

			align 4
			movsb
			movsb
			movsb
			movsb
			movsb
			movsb
			movsb
			movsb

$memcpy_align_done:							// destination is dword aligned
			mov		ecx, ebx					// number of bytes left to copy
			shr		ecx, 6						// get 64-byte block count
			jz		$memcpy_ic_2				// finish the last few bytes

			cmp		ecx, IN_CACHE_COPY/64		// too big for cache? use uncached copy
			jae		$memcpy_uc_test

			// This is small block copy that uses the MMX registers to copy 8 bytes
			// at a time.  It uses the "unrolled loop" optimization, and also uses
			// the software prefetch instruction to get the data into the cache.
			align 16
$memcpy_ic_1:								// 64-byte block copies, in-cache copy

			prefetchnta [esi + (200*64/34+192)]	// start reading ahead (~568 bytes)

			movq	mm0, [esi+0]				// read 64 bits
			movq	mm1, [esi+8]
			movq	[edi+0], mm0				// write 64 bits
			movq	[edi+8], mm1				//    note:  the normal movq writes the
			movq	mm2, [esi+16]				//    data to cache; a cache line will be
			movq	mm3, [esi+24]				//    allocated as needed, to store the data
			movq	[edi+16], mm2
			movq	[edi+24], mm3
			movq	mm0, [esi+32]
			movq	mm1, [esi+40]
			movq	[edi+32], mm0
			movq	[edi+40], mm1
			movq	mm2, [esi+48]
			movq	mm3, [esi+56]
			movq	[edi+48], mm2
			movq	[edi+56], mm3

			add		esi, 64						// update source pointer
			add		edi, 64						// update destination pointer
			dec		ecx							// count down
			jnz		$memcpy_ic_1				// last 64-byte block?

$memcpy_ic_2:
			mov		ecx, ebx					// has valid low 6 bits of the byte count
$memcpy_ic_3:
			shr		ecx, 2						// dword count
			and		ecx, 1111b					// only look at the "remainder" bits
			neg		ecx							// set up to jump into the array
			add		ecx, offset $memcpy_last_few
			jmp		ecx							// jump to array of movsd's

$memcpy_uc_test:
			cmp		ecx, UNCACHED_COPY/64		// big enough? use block prefetch copy
			jae		$memcpy_bp_1

$memcpy_64_test:
			or		ecx, ecx					// tail end of block prefetch will jump here
			jz		$memcpy_ic_2				// no more 64-byte blocks left

			// For larger blocks, which will spill beyond the cache, it's faster to
			// use the Streaming Store instruction MOVNTQ.   This write instruction
			// bypasses the cache and writes straight to main memory.  This code also
			// uses the software prefetch instruction to pre-read the data.
			align 16
$memcpy_uc_1:								// 64-byte blocks, uncached copy

			prefetchnta [esi + (200*64/34+192)]	// start reading ahead (~568 bytes)

			movq	mm0,[esi+0]					// read 64 bits
			add		edi,64						// update destination pointer
			movq	mm1,[esi+8]
			add		esi,64						// update source pointer
			movq	mm2,[esi-48]
			movntq	[edi-64], mm0				// write 64 bits, bypassing the cache
			movq	mm0,[esi-40]				//    note: movntq also prevents the CPU
			movntq	[edi-56], mm1				//    from READING the destination address
			movq	mm1,[esi-32]				//    into the cache, only to be over-written
			movntq	[edi-48], mm2				//    so that also helps performance
			movq	mm2,[esi-24]
			movntq	[edi-40], mm0
			movq	mm0,[esi-16]
			movntq	[edi-32], mm1
			movq	mm1,[esi-8]
			movntq	[edi-24], mm2
			movntq	[edi-16], mm0
			dec		ecx
			movntq	[edi-8], mm1
			jnz		$memcpy_uc_1				// last 64-byte block?

			jmp		$memcpy_ic_2				// almost done

			// For the largest size blocks, a special technique called Block Prefetch
			// can be used to accelerate the read operations.   Block Prefetch reads
			// one address per cache line, for a series of cache lines, in a short loop.
			// This is faster than using software prefetch, in this case.
			// The technique is great for getting maximum read bandwidth,
			// especially in DDR memory systems.
$memcpy_bp_1:								// large blocks, block prefetch copy

			cmp		ecx, CACHEBLOCK				// big enough to run another prefetch loop?
			jl		$memcpy_64_test				// no, back to regular uncached copy

			mov		eax, CACHEBLOCK / 2			// block prefetch loop, unrolled 2X
			add		esi, CACHEBLOCK * 64		// move to the top of the block
			align 16
$memcpy_bp_2:
			mov		edx, [esi-64]				// grab one address per cache line
			mov		edx, [esi-128]				// grab one address per cache line
			sub		esi, 128					// go reverse order
			dec		eax							// count down the cache lines
			jnz		$memcpy_bp_2				// keep grabbing more lines into cache

			mov		eax, CACHEBLOCK				// now that it's in cache, do the copy
			align 16
$memcpy_bp_3:
			movq	mm0, [esi   ]				// read 64 bits
			movq	mm1, [esi+ 8]
			movq	mm2, [esi+16]
			movq	mm3, [esi+24]
			movq	mm4, [esi+32]
			movq	mm5, [esi+40]
			movq	mm6, [esi+48]
			movq	mm7, [esi+56]
			add		esi, 64						// update source pointer
			movntq	[edi   ], mm0				// write 64 bits, bypassing cache
			movntq	[edi+ 8], mm1				//    note: movntq also prevents the CPU
			movntq	[edi+16], mm2				//    from READING the destination address
			movntq	[edi+24], mm3				//    into the cache, only to be over-written,
			movntq	[edi+32], mm4				//    so that also helps performance
			movntq	[edi+40], mm5
			movntq	[edi+48], mm6
			movntq	[edi+56], mm7
			add		edi, 64						// update dest pointer

			dec		eax							// count down

			jnz		$memcpy_bp_3				// keep copying
			sub		ecx, CACHEBLOCK				// update the 64-byte block count
			jmp		$memcpy_bp_1				// keep processing chunks

			// The smallest copy uses the X86 "movsd" instruction, in an optimized
			// form which is an "unrolled loop".   Then it handles the last few bytes.
			align 4
			movsd
			movsd								// perform last 1-15 dword copies
			movsd
			movsd
			movsd
			movsd
			movsd
			movsd
			movsd
			movsd								// perform last 1-7 dword copies
			movsd
			movsd
			movsd
			movsd
			movsd
			movsd

$memcpy_last_few:							// dword aligned from before movsd's
			mov		ecx, ebx					// has valid low 2 bits of the byte count
			and		ecx, 11b					// the last few cows must come home
			jz		$memcpy_final				// no more, let's leave
			rep		movsb						// the last 1, 2, or 3 bytes

$memcpy_final:
			emms								// clean up the MMX state
			sfence								// flush the write buffer
			mov		eax, [dest]					// dest pointer left in eax (the return value in AMD's original, which returned void *)

		}
	}

	/*
	================
	SSE2Memcpy

	copies count bytes in 128-byte chunks through the eight XMM
	registers; requires 16-byte aligned src and dest (movdqa) and a
	count that is a multiple of 128; the caller handles any tail
	================
	*/
	fvoid SSE2Memcpy( fvoid* dest, const fvoid* src, const fulong count )
	{
		__asm
		{
			mov esi, src;    // src pointer
			mov edi, dest;   // dest pointer

			mov ebx, count;  // ebx is our counter
			shr ebx, 7;      // divide by 128 (8 registers * 16 bytes per iteration)
			jz loop_copy_end; // counts below 128 bytes leave nothing for this loop

loop_copy:
			prefetchnta 128[ESI]; //SSE2 prefetch
			prefetchnta 160[ESI];
			prefetchnta 192[ESI];
			prefetchnta 224[ESI];

			movdqa xmm0, 0[ESI]; //move data from src to registers
			movdqa xmm1, 16[ESI];
			movdqa xmm2, 32[ESI];
			movdqa xmm3, 48[ESI];
			movdqa xmm4, 64[ESI];
			movdqa xmm5, 80[ESI];
			movdqa xmm6, 96[ESI];
			movdqa xmm7, 112[ESI];

			movntdq 0[EDI], xmm0; //move data from registers to dest
			movntdq 16[EDI], xmm1;
			movntdq 32[EDI], xmm2;
			movntdq 48[EDI], xmm3;
			movntdq 64[EDI], xmm4;
			movntdq 80[EDI], xmm5;
			movntdq 96[EDI], xmm6;
			movntdq 112[EDI], xmm7;

			add esi, 128;
			add edi, 128;
			dec ebx;

			jnz loop_copy; //loop please
loop_copy_end:
		}
	}
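
	/*
	================
	FastMemcpyExample

	a minimal dispatch sketch (hypothetical helpers, not part of the
	original SDK): checks the CPUID feature bits once and routes copies
	to SSE2Memcpy only when its alignment and size requirements hold,
	falling back to MMXMemcpy otherwise; assumes fvoid/fulong are the
	SDK's aliases for void/unsigned long, and that any SSE2-capable CPU
	also supports MMX
	================
	*/
	static bool HasSSE2( void ) {
		int features = 0;
		__asm {
			push	ebx				// cpuid clobbers ebx; preserve it as the routines above do
			mov		eax, 1
			cpuid
			mov		features, edx
			pop		ebx
		}
		return ( features & ( 1 << 26 ) ) != 0;	// EDX bit 26 = SSE2
	}

	static void FastMemcpyExample( void *dest, const void *src, int count ) {
		// SSE2Memcpy needs 16-byte aligned pointers and a count that is a
		// multiple of 128; MMXMemcpy handles every other case itself
		if ( count > 0 && HasSSE2() && !( ( (int)dest | (int)src ) & 15 ) && !( count & 127 ) ) {
			SSE2Memcpy( dest, src, count );
		} else {
			MMXMemcpy( dest, src, count );
		}
	}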


} // namespace FOXSDK

